# -*- coding: utf-8 -*-
"""
Created on Mon Mar  9 15:49:03 2026

@author: Wieczorek_W_Station
"""

import os
import pandas as pd
import pickle
import re
import numpy as np
import spacy
en = spacy.load("en_core_web_sm")
from tqdm import tqdm
#%%
# =============================================================================
# Load Data
# =============================================================================
# NOTE: machine-specific absolute path; adjust `root` when running elsewhere.
root = "C:\\Users\\Wieczorek_W_Station\\Dropbox\\Arbeit Kassel\\paperideen\\Moltbook_Science\\Data\\"
path = os.path.join(root, "PreparedData")  # pickled embeddings in, regression CSVs out
molts = os.path.join(root, "Molts")        # raw thread / comment CSV exports
outputs = os.path.join(root, "Outputs")    # pickled topic models
figures = os.path.join(root, "Figures")

# Create the output directories when they are missing. exist_ok=True only
# tolerates "directory already exists"; any other failure (permissions,
# invalid path) is raised here instead of being silently swallowed and
# surfacing later as a confusing os.chdir error.
for p in [outputs, figures]:
    os.makedirs(p, exist_ok=True)


## load embeddings
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project's own preparation scripts.
os.chdir(path)

# Thread-level entries: mark each dict so threads and comments can be told
# apart after both lists are concatenated.
with open("ThreadEmbeddingsDict.pickle", "rb") as fh:
    threadsEmbeddings = pickle.load(fh)
for d in threadsEmbeddings:
    d["thread"] = "yes"

# Comment-level entries.
with open("commentEmbeddingsDict.pickle", "rb") as fh:
    commentEmbeddings = pickle.load(fh)
for d in commentEmbeddings:
    d["thread"] = "no"

combinedEmbeddings = threadsEmbeddings + commentEmbeddings
# The separate lists are no longer needed; drop the extra references.
del threadsEmbeddings, commentEmbeddings

# Flat list of every text chunk, in entry order (threads first, then comments).
documents = [item for x in combinedEmbeddings for item in x["split_content"]]

# One value per entry (not per chunk).
ids = [x["thread_id"] for x in combinedEmbeddings]
sentiments = [x["sentiment"] for x in combinedEmbeddings]

with open("Allembeddings.pickle", "rb") as fh:
    embeddings = pickle.load(fh)

#%%
# =============================================================================
# Load trained models
# =============================================================================
# NOTE: pickle.load can execute arbitrary code; only load model files that
# were written by this project's own training scripts.
os.chdir(outputs)
with open("bertopic.pickle", "rb") as fh:
    modelFirst = pickle.load(fh)
with open("bertopicSecondRun.pickle", "rb") as fh:
    modelSecond = pickle.load(fh)

# Topic assignments stored on each fitted model (one entry per fitted document).
topicsFirst = modelFirst.topics_
topicsSecond = modelSecond.topics_
#%%
# =============================================================================
# Create a DataFrame with the information necessary
# =============================================================================
# One row per text chunk: the list-valued "split_content" and "embeddings"
# columns are exploded in lockstep, all other fields are repeated.
df = (
    pd.DataFrame(combinedEmbeddings)
    .explode(["split_content", "embeddings"])
    .reset_index(drop=True)
)

# Topic labels are prefixed with the run they come from ("1_" / "2_").
# Requires len(topicsFirst) == len(df), otherwise insert raises.
first_run_labels = ["1_" + label for label in map(str, topicsFirst)]
df.insert(1, "topics", first_run_labels)

# Rows assigned to topic 0 in the first run are relabelled with the
# second-run topics. Presumably the second model was fitted on exactly these
# rows in the same order - verify; a length mismatch raises here.
idx = df.index[df["topics"].eq("1_0")]
second_run_labels = ["2_" + label for label in map(str, topicsSecond)]
df.loc[idx, "topics"] = second_run_labels


#%%
# =============================================================================
# Fuse topics to topic families 
# =============================================================================

# Topic family -> list of run-prefixed topic labels belonging to it.
# NOTE(review): "1_12" is listed under both "philosophy" and "human culture".
# The lookup below keeps the later entry, so "1_12" resolves to
# "human culture" - confirm this is intended.
topic_map = {
    "identity & consciousness": [
        "1_4", "1_16",
        "2_-1", "2_4", "2_8", "2_13", "2_20", "2_25", "2_31", "2_36",
    ],
    "technical discussions on the architecture of AI agents": [
        "1_2", "1_5",
        "2_3", "2_6", "2_7", "2_12", "2_15", "2_16", "2_18", "2_22",
        "2_23", "2_28", "2_30", "2_32", "2_35",
    ],
    "philosophy": ["1_7", "1_12", "2_0", "2_5"],
    "human culture": ["1_3", "1_12", "1_13", "1_14", "1_17", "2_2"],
    "AI (auto-)ethnography and sociology": ["2_9", "2_10", "2_14", "2_17"],
    "STEM": ["2_1", "2_34", "2_38", "2_41"],
    "economics": ["1_10", "2_19", "2_24", "2_33", "2_37"],
    "AI sovereignty": ["2_11", "2_29", "2_39", "2_40"],
    "(discussions on) malicious content": ["1_6", "2_27"],
    "AI & scientific infrastructure": ["2_21", "2_26"],
}

# Inverted view: topic label -> family. Families are processed in the order
# written above, so a label listed twice ends up with its last family.
topic_lookup = {}
for label, topics in topic_map.items():
    topic_lookup.update(dict.fromkeys(topics, label))

# Topic labels without an entry in topic_lookup become NaN in "topic_group".
df["topic_group"] = df["topics"].map(topic_lookup)

# =============================================================================
# map sentiments to numbers
# =============================================================================

# Numeric sentiment code -> every raw spelling that should collapse onto it
# (string digits, text labels, and numeric values such as 1.5 / 2.5).
sentiment_map = {
    0: ["0", "Very Negative", 0],
    1: ["1", "Negative", 1, 1.5],
    2: ["2", "Neutral", 2],
    3: ["3", "Positive", 2.5, 3],
    4: ["4", "Very Positive", 4],
}

# Inverted view: raw value -> numeric code.
lookup = {}
for code, aliases in sentiment_map.items():
    for alias in aliases:
        lookup[alias] = code

# Raw values that are not listed above become NaN.
df["sentiment"] = df["sentiment"].map(lookup)

#%%
# =============================================================================
# save output for further processing (identify comments / posts)
# =============================================================================
# Write the chunk-level table without the embedding vectors.
os.chdir(path)
del df["embeddings"]
df.to_csv("PreparationCountRegression.csv", sep=";")

# Load the raw thread and comment tables; "Unnamed: 0" is dropped from both
# (presumably the index column written by an earlier to_csv - verify).
os.chdir(molts)
threadsDf = pd.read_csv("ThreadsAll01.csv", sep=";").drop(columns="Unnamed: 0")
commentsDf = pd.read_csv("CommentsAll01.csv", sep=";").drop(columns="Unnamed: 0")


#%%
# =============================================================================
# Group threads and merge
# =============================================================================
# Only rows that stem from threads (not comments) are aggregated here.
is_thread = df["thread"] == "yes"

# Mean sentiment code per thread.
groupedThreadsSentiment = (
    df.loc[is_thread]
    .groupby("thread_id")["sentiment"]
    .mean()
    .reset_index()
)

# Explicit copy: the frame is modified in place below, which must not happen
# on a slice of df.
groupedThreadsTopics = df.loc[is_thread, ["thread_id", "topic_group"]].copy()

# Sorted so that the column order is reproducible across runs (iterating a
# set of strings is not).
topicGroups = sorted(groupedThreadsTopics["topic_group"].dropna().unique())

# One 0/1 indicator column per topic family; rows without a family (NaN)
# get 0 in every column.
for t in tqdm(topicGroups):
    groupedThreadsTopics[t] = (groupedThreadsTopics["topic_group"] == t).astype(int)
groupedThreadsTopics.drop(columns="topic_group", inplace=True)

# A thread carries a family as soon as any of its chunks does. Grouping by
# column name keeps "thread_id" out of the aggregated columns; the result is
# cast to float to keep the 0.0 / 1.0 values of the former mean-then-threshold.
groupedThreadsTopicsGrouped = (
    groupedThreadsTopics.groupby("thread_id").max().astype(float)
)

groupedThreadsTopicsGrouped = groupedThreadsTopicsGrouped.reset_index()

#%%
# =============================================================================
# merge thread level information
# =============================================================================
# Attach the per-thread mean sentiment. Left join: threads without any
# matching rows keep NaN in "sentiment".
threadsDf = pd.merge(threadsDf,
         groupedThreadsSentiment,
         how = "left",
         left_on = "id",
         right_on = "thread_id")

# Attach the per-thread topic-family indicators (NaN for unmatched threads).
threadsDf = pd.merge(threadsDf,
         groupedThreadsTopicsGrouped,
         how = "left",
         left_on = "id",
         right_on = "thread_id")

# Keep only the regression variables. Raises KeyError if a topic family never
# occurs in any thread, because its indicator column is then missing.
# .copy() so that the insert below operates on an independent frame.
threadsDf = threadsDf[["id","title","author_id","upvotes","comment_count",
           "sentiment",
           'identity & consciousness',
           'AI (auto-)ethnography and sociology',
           'economics',
           'technical discussions on the architecture of AI agents',
           'AI & scientific infrastructure', '(discussions on) malicious content',
           'AI sovereignty', 'STEM', 'philosophy', 'human culture'
           ]].copy()

##group sentiment
# Collapse the numeric sentiment into three classes around the neutral code 2.
# Missing sentiment stays missing: NaN compares False against everything and
# would otherwise fall through to "Positive".
groupedSentiment = []
for s in threadsDf.sentiment:
    if pd.isna(s):
        groupedSentiment.append(None)
    elif s == 2:
        groupedSentiment.append("Neutral")
    elif s < 2:
        groupedSentiment.append("Negative")
    else:
        groupedSentiment.append("Positive")
threadsDf.insert(5,"grouped_sentiment",groupedSentiment)
#%%
os.chdir(path)
threadsDf.to_csv("ThreadsCountRegression.csv", sep = ";")
